library(pdftools)
library(tidyverse)
library(magrittr)
library(stringi)
library(tidytext)
library(igraph)
library(ggraph)
library(ggplot2)
Loading data
ocpt %>% glimpse()
Rows: 7,424
Columns: 6
Groups: orig_id, organization, fullpath, gcode [697]
$ orig_id <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
$ organization <chr> "1&1 Drillisch AG", "1&1 Drillisch AG", "1&1 Drillisch AG", "1&1 Drillisch AG", "1&1 Drillisch AG", "1&1 Drillisch AG", "1&1 Drillisch AG", "1&1 Drillis…
$ fullpath <chr> "reports/11_drillisch_ag/2020/sustainability_report_2020.pdf", "reports/11_drillisch_ag/2020/sustainability_report_2020.pdf", "reports/11_drillisch_ag/2…
$ gcode <dbl> 401, 401, 403, 403, 404, 405, 405, 405, 405, 406, 406, 408, 409, 414, 414, 414, 416, 417, 417, 417, 418, 418, 999, 999, 999, 999, 999, 999, 999, 999, 99…
$ pdf_page <dbl> 38, 46, 48, 50, 41, 44, 9, 46, 48, 44, 45, 68, 68, 67, 68, 66, 25, 33, 55, 35, 28, 30, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 2…
$ text <chr> "1&1 DRILLISCH AG SUSTAINABILITY REPORT 2019\n38\nRESULTS AND ASSESSMENT\nWe use a range of performance indicators to measure the effectiveness of our h…
Prepare sets
#for modelling
ocpt2 <- ocpt %>% ungroup()%>% distinct(fullpath, pdf_page, .keep_all = TRUE)
#for text mining
ocpt3 <- ocpt %>% filter(!gcode=="999") %>% ungroup() %>% distinct(fullpath, pdf_page, .keep_all = TRUE)
ocpt3 %<>% mutate(doc_id=paste0(orig_id, ", ", gcode))
Annotating text
library(udpipe)
udmodel <- udpipe_load_model(file = "english-ewt-ud-2.5-191206.udpipe")
up <- udpipe_annotate(udmodel, x = ocpt3$text, doc_id = ocpt3$doc_id)
up <- as.data.frame(up)
Making a list of verbs from available texts
stats <- subset(up, upos %in% c("VERB"))
stats <- txt_freq(stats$lemma)
stats %>% head(50)
write.csv(stats, "verbs.csv")
Uploading a list of grouped verbs
To find synonims for English words we can use the WordNet and its R wrapper in the wordnet package. (https://bernhardlearns.blogspot.com/2017/04/cleaning-words-with-r-stemming.html)
verbs %<>% str_trim(verb)
Error in match.arg(side) : object 'verb' not found
#verbs_qp <- verbs %>% filter(character=="QN") %>% select(verb)
up2 <- up %>% subset(upos %in% c("VERB"))
up2 %<>% subset(lemma %in% as.vector(verbs$verb))
up2 %<>% merge(txt_freq(up2$lemma), by.x= "lemma", by.y = "key")
# up2 %<>% count(lemma) %>% mutate(freq = n / sum(n)) %>% arrange(desc(n))
up2 %<>% left_join(verbs, by=c("lemma"="verb"))
up2 %<>% unique()
up2 %<>% separate(doc_id, c("doc", "gcode"), sep = ", ")
# verbs %<>% merge(up2, by.x = "verb", by.y="lemma")
up2 %>%
distinct(lemma, .keep_all = T) %>%
group_by(character) %>% arrange(desc(freq)) %>% slice(1:12) %>% ungroup() %>%
mutate(verb = reorder_within(lemma, by = freq, within = character)) %>%
ggplot(aes(x = verb, y = freq, fill = character)) +
geom_col(show.legend = FALSE) +
labs(x = "Verbs", y = "frequency") +
facet_wrap(~character, ncol = 3, scales = "free_y") +
coord_flip() +
scale_x_reordered() +
theme(axis.text.y = element_text(size = 6))

up2 %>%
distinct(lemma, .keep_all = T) %>%
group_by(character) %>% summarize(sum=sum(freq)) %>% #arrange(desc(sum)) %>%
ggplot(aes(x = character, y = sum)) +
geom_bar(stat = "sum",show.legend = FALSE)

up2 %>%
group_by(gcode,character) %>% count(lemma) %>% summarize(sum=sum(n), .groups = "keep") %>%
mutate(character = reorder_within(character, by = sum, within = gcode)) %>%
ggplot(aes(x = character, y = sum, fill = gcode)) +
geom_col(show.legend = FALSE) +
labs(x = "Verbs", y = "frequency") +
facet_wrap(~gcode, ncol = 4, scales = "free_y") +
coord_flip() +
scale_x_reordered() +
theme(axis.text.y = element_text(size = 6))

verb_phrase_simp <- "((A|N)*N(P+D*(A|N)*N)*P*(M|V)*V(M|V)*|(M|V)*V(M|V)*D*(A|N)*N(P+D*(A|N)*N)*|(M|V)*V(M|V)*(P+D*(A|N)*N)+|(A|N)*N(P+D*(A|N)*N)*P*((M|V)*V(M|V)*D*(A|N)*N(P+D*(A|N)*N)*|(M|V)*V(M|V)*(P+D*(A|N)*N)+))" # Simple verb Phrase
verb_phrase_with_cc <- "(((A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*(C(D(CD)*)*(A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*)*)(P(CP)*)*(M(CM)*|V)*V(M(CM)*|V)*(C(M(CM)*|V)*V(M(CM)*|V)*)*|(M(CM)*|V)*V(M(CM)*|V)*(C(M(CM)*|V)*V(M(CM)*|V)*)*(D(CD)*)*((A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*(C(D(CD)*)*(A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*)*)|(M(CM)*|V)*V(M(CM)*|V)*(C(M(CM)*|V)*V(M(CM)*|V)*)*((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)+|((A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*(C(D(CD)*)*(A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*)*)(P(CP)*)*((M(CM)*|V)*V(M(CM)*|V)*(C(M(CM)*|V)*V(M(CM)*|V)*)*(D(CD)*)*((A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*(C(D(CD)*)*(A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*)*)|(M(CM)*|V)*V(M(CM)*|V)*(C(M(CM)*|V)*V(M(CM)*|V)*)*((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)+))" # Verb phrase with coordination conjunction
up3 <- up %>% subset(sentence %in% up2$sentence)
up3 %<>% separate(doc_id, c("doc", "gcode"), sep = ", ")
up3 %<>% mutate(phrase_tag=as_phrasemachine(upos,type="upos"))
verb_phrases <- keywords_phrases(up3$phrase_tag, term = up3$token,
pattern = verb_phrase_simp, is_regex = TRUE,
ngram_max = 5,
detailed = TRUE)
head(sort(table(verb_phrases$keyword), decreasing=TRUE), 20)
we have that are WE GET WE GET THINGS
146 138 136 136
WE GET THINGS DONE employees are it is which is
130 126 124 117
We are organization shall organization shall report organization shall report its
108 102 102 102
organization shall report its management report its report its management report its management approach
102 102 102 102
reporting organization shall reporting organization shall report reporting organization shall report its shall report its
102 102 102 102
verb_phrases <- lemmed %>%
group_by(start, end) %>%
summarise(string = c(word)) %>%
rowwise()
nest_by(ngram, pattern, start, end)
Trying to find cooccurrences amoung slected verbs and nouns — some uncertainty introduced since lemma can be both noun and a verb.
cooc <- cooccurrence(x = subset(up3, upos %in% c("NOUN", "VERB")),
term = "lemma",
group = c("doc", "gcode", "paragraph_id", "sentence_id"))
cooc %<>% subset(term1 %in% verbs$verb | term2 %in% verbs$verb)
cooc %>% str()
Classes ‘cooccurrence’, ‘data.table’ and 'data.frame': 65771 obs. of 3 variables:
$ term1: chr "impact" "page" "employee" "employee" ...
$ term2: chr "rights" "risk" "risk" "provide" ...
$ cooc : num 1063 955 915 871 741 ...

Top 5 verbs in every verb group
Most cooccured nouns with verbs by character in 401 code

Top Nouns cooccured with verbs by every verb-character

library(igraph)
library(ggraph)
library(ggplot2)
plot_annotation <- function(x, size = 3){
stopifnot(is.data.frame(x) & all(c("sentence_id", "token_id", "head_token_id", "dep_rel",
"token", "lemma", "upos", "xpos", "feats") %in% colnames(x)))
x <- x[!is.na(x$head_token_id), ]
x <- x[x$sentence_id %in% min(x$sentence_id), ]
edges <- x[x$head_token_id != 0, c("token_id", "head_token_id", "dep_rel")]
edges$label <- edges$dep_rel
g <- graph_from_data_frame(edges,
vertices = x[, c("token_id", "token", "lemma", "upos", "xpos", "feats")],
directed = TRUE)
ggraph(g, layout = "linear") +
geom_edge_arc(ggplot2::aes(label = dep_rel, vjust = -0.20),
arrow = grid::arrow(length = unit(4, 'mm'), ends = "last", type = "closed"),
end_cap = ggraph::label_rect("wordswordswords"),
label_colour = "red", check_overlap = TRUE, label_size = size) +
geom_node_label(ggplot2::aes(label = token), col = "darkgreen", size = size, fontface = "bold") +
geom_node_text(ggplot2::aes(label = upos), nudge_y = -0.35, size = size) +
theme_graph(base_family = "Arial Narrow") +
labs(title = "udpipe output", subtitle = "tokenisation, parts of speech tagging & dependency relations")
}
plot_annotation(up3[645:663,])


up3[645:663,]
NA
undata2 <- pre_pro_rep %>%
unnest_tokens(bigram,text,token = "ngrams", n=2) #%>%
#mutate(bigram = bigram %>% str_remove_all("[^[:alnum:]]")) %>%
undata1 <- pre_pro_rep %>%
unnest_tokens(word,text,token = "ngrams", n=1) %>%
mutate(word = word %>% str_remove_all("[^[:alnum:]]")) %>%
mutate(word = word %>% str_remove_all(rem_dig)) %>%
mutate(word = word %>% str_remove_all("null")) %>%
filter(!is.na(word)) %>%
anti_join(stop_words, by = "word")
bigrams_separated <- undata2 %>% separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>% filter(!word1 %in% stop_words$word) %>% filter(!word2 %in% stop_words$word)
# new bigram counts:
bigram_counts <- bigrams_filtered %>% count(word1, word2, sort = TRUE)
bigrams_united <- bigrams_filtered %>% unite(bigram, word1, word2, sep = " ")
bigrams_united %<>% add_count(X,bigram) %>% bind_tf_idf(term = bigram, document = X, n = n)
words <- undata1 %>% add_count(X,word) %>% bind_tf_idf(term = word, document = X, n = n)
bigrams_united %>% count(bigram, wt = tf_idf, sort = TRUE) %>% head(25)
top_by_g <- bigrams_united %>% group_by(gcode) %>% count(bigram, wt = tf_idf, sort = TRUE, name = "tf_idf") %>% dplyr::slice(1:12) %>% ungroup()
top_by_g <- words %>% group_by(gcode) %>% count(word, wt = tf_idf, sort = TRUE, name = "tf_idf") %>% dplyr::slice(1:5) %>% ungroup()
top_by_y <- bigrams_united %>% group_by(rep_year) %>% count(bigram, wt = tf_idf, sort = TRUE, name = "tf_idf") %>% dplyr::slice(1:12) %>% ungroup()
# top bigrams in each g_code
top_by_g %>% mutate(bigram = reorder_within(bigram, by = tf_idf, within = gcode)) %>%
ggplot(aes(x = bigram, y = tf_idf, fill = gcode)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~gcode, ncol = 3, scales = "free") +
coord_flip() +
scale_x_reordered() +
theme(axis.text.y = element_text(size = 6))
top_by_g %>% mutate(word = reorder_within(word, by = tf_idf, within = gcode)) %>%
ggplot(aes(x = word, y = tf_idf, fill = gcode)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~gcode, ncol = 3, scales = "free") +
coord_flip() +
scale_x_reordered() +
theme(axis.text.y = element_text(size = 8))
library(textdata)
library(recipes)
library(tidymodels)
library(textrecipes)
library(themis)
library(tune)
library(glmnet)
library(ranger)
mdata <- ocpt2 %>% ungroup() %>% select(gcode,text)
#glove6b <- embedding_glove6b(dimensions = 100)
set.seed(1234)
rem_punct <- regex("[[:punct:]]")
rem_dig <- regex("[[:digit:]]")
mdata %<>% mutate(text=str_squish(text),
text=str_remove_all(text,rem_punct),
text=str_remove_all(text,rem_dig))
mdata <-mdata[-c(no_label_index[c(random_indexes)])]
Error: Can't negate columns that don't exist.
x Locations 3265, 6190, 3919, 195, 4845, etc. don't exist.
ℹ There are only 2 columns.
Run `rlang::last_error()` to see where the error occurred.
write.csv(mdata, "mdata_2.csv")
tidy_split <- initial_split(mdata, strata = gcode, prop = 0.7)
train_data <- training(tidy_split)
test_data <- testing(tidy_split)
tidy_split
<Analysis/Assess/Total>
<662/287/949>
train_data <- recipe(gcode~., data = train_data) %>% themis::step_upsample(gcode) %>% prep() %>% juice()
train_data %<>% mutate(text=as.character(text))
data_res <- train_data %>% vfold_cv(strata = gcode, v = 10, repeats = 3)
data_res <- vfold_cv(train_data)
tf_idf_rec <- recipe(gcode ~ ., data = train_data) %>%
step_tokenize(text) %>%
step_stem(text) %>%
step_stopwords(text) %>%
step_tokenfilter(text, max_tokens = 2000) %>%
step_tfidf(all_predictors())
tf_idf_data <- tf_idf_rec %>% prep() %>% juice()
hash_rec <- recipe(gcode~., data = train_data) %>%
step_tokenize(text) %>%
step_stem(text) %>%
step_stopwords(text) %>%
step_tokenfilter(text, max_tokens = 1000) %>%
step_texthash(text, num_terms = 100)
hash_rec %>% prep() %>% juice()
NA
model_lg <- multinom_reg() %>%
set_args(penalty=tune(), mixture=NULL) %>%
set_engine("glmnet") %>%
set_mode("classification")
model_rf <- rand_forest() %>%
set_engine("ranger", importance = "impurity") %>%
set_mode("classification")
logistic_grid <- grid_regular(parameters(model_lg), levels = 3)
model_control <- control_grid(save_pred = TRUE)
model_metrics <- metric_set(accuracy, roc_auc)
linear_tf_res <- tune_grid(model_lg, tf_idf_rec, grid = logistic_grid, control = model_control, metrics = model_metrics, resamples = data_res)
linear_hash_res <- tune_grid(model_lg, hash_rec, grid = logistic_grid, control = model_control, metrics = model_metrics, resamples = data_res)
workflow_general_tf <- workflow() %>% add_recipe(tf_idf_rec)
workflow_lg_tf <- workflow_general_tf %>% add_model(model_lg)
workflow_rf_tf <- workflow_general_tf %>% add_model(model_rf)
workflow_general_hash <- workflow() %>% add_recipe(hash_rec)
workflow_lg_hash <- workflow_general_hash %>% add_model(model_lg)
workflow_rf_hash <- workflow_general_hash %>% add_model(model_rf)
linear_tf_res %>% autoplot()

best_param_linear_tf_res <- linear_tf_res %>% select_best(metric = 'accuracy')
best_param_linear_tf_res
workflow_final_lg_tf <- workflow_lg_tf %>%
finalize_workflow(parameters = best_param_linear_tf_res)
log_res_tf <- workflow_final_lg_tf %>%
fit_resamples(resamples = data_res,
metrics = metric_set(recall, precision, f_meas, accuracy, kap, roc_auc, sens), control = control_resamples(save_pred = TRUE))
log_res_tf %>% collect_metrics(summarize = TRUE)
linear_hash_res %>% autoplot()

best_param_linear_hash_res <- linear_hash_res %>% select_best(metric = 'accuracy')
best_param_linear_hash_res
workflow_final_lg_hash <- workflow_lg_hash %>%
finalize_workflow(parameters = best_param_linear_hash_res)
log_res_hash <- workflow_final_lg_hash %>%
fit_resamples( resamples = data_res,
metrics = metric_set(recall, precision, f_meas, accuracy, kap, roc_auc, sens), control = control_resamples(save_pred = TRUE))
log_res_hash %>% collect_metrics(summarize = TRUE)
rf_res_hash <- workflow_rf_hash %>%
fit_resamples(resamples = data_res,
metrics = metric_set(recall, precision, f_meas, accuracy, kap, roc_auc, sens),
control = control_resamples( save_pred = TRUE))
rf_res_hash %>% collect_metrics(summarize = TRUE)
NA
rf_res_tf <- workflow_rf_tf %>%
fit_resamples(resamples = data_res,
metrics = metric_set(recall, precision, f_meas, accuracy, kap, roc_auc, sens),
control = control_resamples( save_pred = TRUE))
rf_res_tf %>% collect_metrics(summarize = TRUE)
NA
log_metrics_tf <- log_res_tf %>% collect_metrics(summarise = TRUE) %>% mutate(model = "Logistic Regression TF-idf")
log_metrics_hash <- log_res_hash %>% collect_metrics(summarise = TRUE) %>% mutate(model = "Logistic Regression Hash")
rf_metrics_tf <- rf_res_tf %>% collect_metrics(summarise = TRUE) %>% mutate(model = "Random Forest TF-idf")
rf_metrics_hash <- rf_res_hash %>% collect_metrics(summarise = TRUE) %>% mutate(model = "Random Forest Hash")
model_compare <- bind_rows(
log_metrics_tf,
log_metrics_hash,
rf_metrics_tf,
rf_metrics_hash)
rf_stat1 <- rf_res_tf %>% collect_metrics(summarize = TRUE)
rf_stat1
model_comp <- model_compare %>%
select(model, .metric, mean, std_err) %>%
pivot_wider(names_from = .metric, values_from = c(mean, std_err))
model_comp %>%
arrange(mean_f_meas) %>%
mutate(model = fct_reorder(model, mean_f_meas)) %>%
ggplot(aes(model, mean_f_meas, fill=model)) +
geom_col() + coord_flip() +
scale_fill_brewer(palette = "YlGn") +
geom_text(size = 3, aes(label = round(mean_f_meas, 2), y = mean_f_meas + 0.08), vjust = 1)

rf_pred_tf <- rf_res_tf %>%
collect_predictions()
rf_pred_tf %>% conf_mat(gcode, .pred_class)
Truth
Prediction 401 403 404 405 407 413 414 416 417 999
401 125 5 4 9 0 0 0 0 0 3
403 5 132 4 0 0 2 0 0 0 6
404 5 0 129 0 0 0 0 0 0 3
405 3 0 0 127 0 0 0 0 0 4
407 0 0 0 0 140 0 0 0 0 0
413 1 0 1 0 0 130 1 0 0 9
414 0 0 1 0 0 0 139 0 0 6
416 0 0 0 0 0 0 0 140 0 0
417 0 0 0 0 0 0 0 0 140 2
999 1 3 1 4 0 8 0 0 0 107
rf_pred_tf %>% conf_mat(gcode, .pred_class) %>% autoplot(type = "heatmap")


last_fit_rf <- last_fit(workflow_rf_tf,
split = tidy_split,
metrics = metric_set(recall, precision, f_meas, accuracy, kap, roc_auc, sens))
rf_stat2 <- last_fit_rf %>% collect_metrics()
rf_stat2
last_fit_rf %>% collect_predictions() %>% conf_mat(gcode, .pred_class) %>% autoplot(type = "heatmap")

myt <- rf_stat1 %>% left_join(rf_stat2, by=".metric") %>% select(.metric, mean, std_err, .estimate)
myt
NA
#stargazer(myt, type = "latex", out = "models.tex")
stargazer(myt, type = "text")
===================================================
Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
===================================================
---
title: "R Notebook"
output: html_notebook
---


```{r}
library(pdftools)
library(tidyverse)
library(magrittr)
library(stringi)
library(tidytext)
library(igraph)
library(ggraph)
library(ggplot2)
```

## Loading data
```{r}
ocpt <- readRDS("orlist_coges_pages_text_unnested.rds")
ocpt %>% glimpse()
```
## Prepare sets
```{r}

#for modelling
ocpt2 <- ocpt %>% ungroup()%>% distinct(fullpath, pdf_page, .keep_all = TRUE)

#for text mining
ocpt3 <- ocpt %>% filter(!gcode=="999") %>% ungroup() %>% distinct(fullpath, pdf_page, .keep_all = TRUE)
ocpt3 %<>% mutate(doc_id=paste0(orig_id, ", ", gcode))
```

## Annotating text
```{r}
library(udpipe)
udmodel <- udpipe_load_model(file = "english-ewt-ud-2.5-191206.udpipe")

up <- udpipe_annotate(udmodel, x = ocpt3$text, doc_id = ocpt3$doc_id)
up <- as.data.frame(up)
```


## Making a list of verbs from available texts
```{r}

stats <- subset(up, upos %in% c("VERB")) 
stats <- txt_freq(stats$lemma)
stats %>% head(50)
write.csv(stats, "verbs.csv")

```
## Uploading a list of grouped verbs

To find synonims for English words we can use the WordNet and its R wrapper in the wordnet package. (https://bernhardlearns.blogspot.com/2017/04/cleaning-words-with-r-stemming.html)
```{r}
verbs<-read.csv("verbs_grouped.csv")

verbs %<>% mutate(verb=str_trim(verb))
verbs %<>% unique()
```

```{r}
#verbs_qp <- verbs %>% filter(character=="QN") %>% select(verb)

up2 <- up %>% subset(upos %in% c("VERB"))
up2 %<>% subset(lemma %in% as.vector(verbs$verb))
up2 %<>% merge(txt_freq(up2$lemma), by.x= "lemma", by.y = "key")
# up2 %<>% count(lemma) %>%  mutate(freq = n / sum(n)) %>%   arrange(desc(n))

up2  %<>%  left_join(verbs, by=c("lemma"="verb"))
up2 %<>% unique()
up2 %<>% separate(doc_id, c("doc", "gcode"), sep = ", ")
# verbs %<>% merge(up2, by.x = "verb", by.y="lemma")

```

```{r}
up2 %>%
  distinct(lemma, .keep_all = T) %>% 
  group_by(character) %>% arrange(desc(freq)) %>% slice(1:12) %>% ungroup() %>% 
  mutate(verb = reorder_within(lemma, by = freq, within = character)) %>%
  ggplot(aes(x = verb, y = freq, fill = character)) +
    geom_col(show.legend = FALSE) +
    labs(x = "Verbs", y = "frequency") +
    facet_wrap(~character, ncol = 3, scales = "free_y") +
    coord_flip() +
    scale_x_reordered() +
    theme(axis.text.y = element_text(size = 6))
```


```{r}

up2 %>% 
  distinct(lemma, .keep_all = T) %>% 
  group_by(character) %>% summarize(sum=sum(freq)) %>% #arrange(desc(sum)) %>% 
  ggplot(aes(x = character, y = sum)) +
    geom_bar(stat = "sum",show.legend = FALSE)

```


```{r}

up2 %>% 
  group_by(gcode,character) %>% count(lemma) %>% summarize(sum=sum(n), .groups = "keep") %>%
  mutate(character = reorder_within(character, by = sum, within = gcode)) %>%
  ggplot(aes(x = character, y = sum, fill = gcode)) +
    geom_col(show.legend = FALSE) +
    labs(x = "Verbs", y = "frequency") +
    facet_wrap(~gcode, ncol = 4, scales = "free_y") +
    coord_flip() +
    scale_x_reordered() +
    theme(axis.text.y = element_text(size = 6))
```
```{r}
verb_phrase_simp <- "((A|N)*N(P+D*(A|N)*N)*P*(M|V)*V(M|V)*|(M|V)*V(M|V)*D*(A|N)*N(P+D*(A|N)*N)*|(M|V)*V(M|V)*(P+D*(A|N)*N)+|(A|N)*N(P+D*(A|N)*N)*P*((M|V)*V(M|V)*D*(A|N)*N(P+D*(A|N)*N)*|(M|V)*V(M|V)*(P+D*(A|N)*N)+))" # Simple verb Phrase
verb_phrase_with_cc <- "(((A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*(C(D(CD)*)*(A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*)*)(P(CP)*)*(M(CM)*|V)*V(M(CM)*|V)*(C(M(CM)*|V)*V(M(CM)*|V)*)*|(M(CM)*|V)*V(M(CM)*|V)*(C(M(CM)*|V)*V(M(CM)*|V)*)*(D(CD)*)*((A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*(C(D(CD)*)*(A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*)*)|(M(CM)*|V)*V(M(CM)*|V)*(C(M(CM)*|V)*V(M(CM)*|V)*)*((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)+|((A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*(C(D(CD)*)*(A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*)*)(P(CP)*)*((M(CM)*|V)*V(M(CM)*|V)*(C(M(CM)*|V)*V(M(CM)*|V)*)*(D(CD)*)*((A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*(C(D(CD)*)*(A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*)*)|(M(CM)*|V)*V(M(CM)*|V)*(C(M(CM)*|V)*V(M(CM)*|V)*)*((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)+))" # Verb phrase with coordination conjunction
```

```{r}

up3 <- up %>% subset(sentence %in% up2$sentence)
up3 %<>% separate(doc_id, c("doc", "gcode"), sep = ", ")

up3 %<>% mutate(phrase_tag=as_phrasemachine(upos,type="upos"))

verb_phrases <- keywords_phrases(up3$phrase_tag, term = up3$token, 
                                pattern = verb_phrase_simp, is_regex = TRUE, 
                                ngram_max = 7, 
                                detailed = TRUE)

head(sort(table(verb_phrases$keyword), decreasing=TRUE), 20)
```


```{r}
library(textstem)
verb_phrases2 <-verb_phrases

lemmed <- verb_phrases %>% unnest_tokens(word,keyword,token = "ngrams", n=1) 
lemmed %<>% mutate(lemma=lemmatize_words(word, dictionary = lexicon::hash_lemmas))
lemmed_ss <- lemmed %>% subset(lemma %in% verbs$verb[verbs$character=="QP"])
verb_phrases2 %<>% subset(start %in% lemmed_ss$start & end %in% lemmed_ss$end)


verb_phrases2 %>% 
  filter(ngram>=3) %>%
  merge(txt_freq(verb_phrases2$keyword), by.x= "keyword", by.y = "key") %>% 
  arrange(desc(freq)) %>% 
  #group_by(keyword,freq) %>% summari
  distinct(keyword,freq)

# verbs$verb[verbs$character=="QP"]

```


```{r}
verb_phrases <- lemmed %>%
  group_by(start, end) %>%
  summarise(string = c(word)) %>%
  rowwise()

  nest_by(ngram, pattern, start, end)
```




## Trying to find cooccurrences amoung slected verbs and nouns — some uncertainty introduced since lemma can be both noun and a verb.
```{r}
cooc <- cooccurrence(x = subset(up3, upos %in% c("NOUN", "VERB")), 
                     term = "lemma", 
                     group = c("doc", "gcode", "paragraph_id", "sentence_id"))
cooc %<>% subset(term1 %in% verbs$verb | term2 %in% verbs$verb) 
cooc %>% str()
```


```{r}
wordnetwork <- head(cooc, 50)
wordnetwork <- graph_from_data_frame(wordnetwork)
ggraph(wordnetwork, layout = "fr") +
  geom_edge_link(aes(width = cooc, edge_alpha = cooc), edge_colour = "lightblue") +
  geom_node_text(aes(label = name), col = "blue", size = 4) +
  theme_graph(base_family = "Sans Serif") +
  theme(legend.position = "none") +
  labs(title = "Cooccurrences within sentence", subtitle = "Nouns & Verbs")

```
## Merging done badly
```{r}
tempds<-up2 %>% select(doc, gcode, paragraph_id, sentence_id, freq, freq_pct, lemma, character) 

tempds <- left_join(up3, tempds, by=c("doc", "gcode", "paragraph_id", "sentence_id"))

tempds %<>% distinct(doc, gcode, paragraph_id, sentence_id, token_id, .keep_all = T)
```


## Top 5 verbs in every verb group
```{r}
t5v <- up2 %>% 
  select(lemma,character,freq) %>%
  distinct() %>% 
  group_by(character) %>% 
  arrange(desc(freq)) %>% 
  slice(1:5) %>% 
  ungroup()
t5v
```

## Most cooccured nouns with verbs by character in 401 code

```{r}
freq <- tempds %>%
  filter(gcode=="401") %>% 
  subset(lemma.y %in% t5v$lemma & upos %in% "NOUN") %>%
  group_by(lemma.y) %>% count(lemma.x) %>% mutate(freq = n / sum(n)) %>% ungroup()
freq
freq %>% 
  group_by(lemma.y) %>% arrange(desc(freq)) %>% slice(1:6) %>% ungroup() %>% 
  merge(t5v %>% select(lemma,character), by.x="lemma.y", by.y="lemma") %>% 
  mutate(lemma.y = str_c("(",character,") ",lemma.y, sep=""),
         noun = reorder_within(lemma.x, by = n, within = lemma.y)) %>%
  ggplot(aes(x = noun, y = n, fill = character)) +
    geom_col(show.legend = FALSE) +
    labs(x = "Nouns", y = "frequency") +
    facet_wrap(~lemma.y, ncol = 4, scales = "free_y") +
    coord_flip() +
    scale_x_reordered() +
    #facet_grid(~character)+
    theme(axis.text.y = element_text(size = 6))

```

## Top Nouns cooccured with verbs by every verb-character
```{r}
 
tempds %>%
  subset(lemma.y %in% t5v$lemma & upos %in% "NOUN") %>%
  group_by(gcode,character) %>% count(lemma.x) %>% mutate(freq = n / sum(n)) %>% ungroup() %>% 
  filter(gcode=="401") %>% 
  group_by(character) %>% arrange(desc(n)) %>% slice(1:12) %>% ungroup() %>%
  #mutate(noun = reorder_within(lemma.x, by = n, within = character),) %>%
  ggplot(aes(x = lemma.x, y = n, fill = character)) +
    geom_col(show.legend = FALSE) +
    labs(x = "Nouns", y = "frequency") +
    #facet_wrap(~character, ncol = 6, scales = "free_y") +
    coord_flip() +
    #scale_x_reordered() +
    facet_grid(~character)+
    theme(axis.text.y = element_text(size = 6))

```






```{r}

library(igraph)
library(ggraph)
library(ggplot2)
plot_annotation <- function(x, size = 3){
  stopifnot(is.data.frame(x) & all(c("sentence_id", "token_id", "head_token_id", "dep_rel",
                                     "token", "lemma", "upos", "xpos", "feats") %in% colnames(x)))
  x <- x[!is.na(x$head_token_id), ]
  x <- x[x$sentence_id %in% min(x$sentence_id), ]
  edges <- x[x$head_token_id != 0, c("token_id", "head_token_id", "dep_rel")]
  edges$label <- edges$dep_rel
  g <- graph_from_data_frame(edges,
                             vertices = x[, c("token_id", "token", "lemma", "upos", "xpos", "feats")],
                             directed = TRUE)
  ggraph(g, layout = "linear") +
    geom_edge_arc(ggplot2::aes(label = dep_rel, vjust = -0.20),
                  arrow = grid::arrow(length = unit(4, 'mm'), ends = "last", type = "closed"),
                  end_cap = ggraph::label_rect("wordswordswords"),
                  label_colour = "red", check_overlap = TRUE, label_size = size) +
    geom_node_label(ggplot2::aes(label = token), col = "darkgreen", size = size, fontface = "bold") +
    geom_node_text(ggplot2::aes(label = upos), nudge_y = -0.35, size = size) +
    theme_graph(base_family = "Arial Narrow") +
    labs(title = "udpipe output", subtitle = "tokenisation, parts of speech tagging & dependency relations")
}

up3[645:663,]
plot_annotation(up3[645:663,])


```
```{r}

up4 <- up2 %>% filter(dep_rel=="root")

up4 %>% group_by(character) %>% count()

```







```{r}
undata2 <- pre_pro_rep %>% 
  unnest_tokens(bigram,text,token = "ngrams", n=2) #%>% 
  #mutate(bigram = bigram %>% str_remove_all("[^[:alnum:]]")) %>% 

undata1 <- pre_pro_rep %>% 
  unnest_tokens(word,text,token = "ngrams", n=1) %>% 
  mutate(word = word %>% str_remove_all("[^[:alnum:]]")) %>% 
  mutate(word = word %>% str_remove_all(rem_dig)) %>% 
  mutate(word = word %>% str_remove_all("null")) %>% 
  filter(!is.na(word)) %>% 
  anti_join(stop_words, by = "word")

bigrams_separated <- undata2 %>% separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>% filter(!word1 %in% stop_words$word) %>% filter(!word2 %in% stop_words$word)

# new bigram counts:
bigram_counts <- bigrams_filtered %>% count(word1, word2, sort = TRUE)
bigrams_united <- bigrams_filtered %>% unite(bigram, word1, word2, sep = " ")
bigrams_united %<>% add_count(X,bigram) %>% bind_tf_idf(term = bigram, document = X, n = n)

words <- undata1 %>% add_count(X,word) %>% bind_tf_idf(term = word, document = X, n = n)

bigrams_united %>% count(bigram, wt = tf_idf, sort = TRUE) %>% head(25)

top_by_g <- bigrams_united %>% group_by(gcode) %>% count(bigram, wt = tf_idf, sort = TRUE, name = "tf_idf") %>% dplyr::slice(1:12) %>% ungroup()

top_by_g <- words %>% group_by(gcode) %>% count(word, wt = tf_idf, sort = TRUE, name = "tf_idf") %>% dplyr::slice(1:5) %>% ungroup()

top_by_y <- bigrams_united %>% group_by(rep_year) %>% count(bigram, wt = tf_idf, sort = TRUE, name = "tf_idf") %>% dplyr::slice(1:12) %>% ungroup()

# top bigrams in each g_code
top_by_g %>% mutate(bigram = reorder_within(bigram, by = tf_idf, within = gcode)) %>%
  ggplot(aes(x = bigram, y = tf_idf, fill = gcode)) +
    geom_col(show.legend = FALSE) +
    labs(x = NULL, y = "tf-idf") +
    facet_wrap(~gcode, ncol = 3, scales = "free") +
    coord_flip() +
    scale_x_reordered() +
    theme(axis.text.y = element_text(size = 6))


top_by_g %>% mutate(word = reorder_within(word, by = tf_idf, within = gcode)) %>%
  ggplot(aes(x = word, y = tf_idf, fill = gcode)) +
    geom_col(show.legend = FALSE) +
    labs(x = NULL, y = "tf-idf") +
    facet_wrap(~gcode, ncol = 3, scales = "free") +
    coord_flip() +
    scale_x_reordered() +
    theme(axis.text.y = element_text(size = 8))

```

```{r}
library(textdata)
library(recipes)
library(tidymodels)
library(textrecipes)
library(themis)
library(tune)
library(glmnet)
library(ranger)
```


```{r}
mdata <- ocpt2 %>% ungroup() %>%  select(gcode,text)

#glove6b <- embedding_glove6b(dimensions = 100)

set.seed(1234)

rem_punct <- regex("[[:punct:]]")
rem_dig <- regex("[[:digit:]]")

mdata %<>% mutate(text=str_squish(text),
                  text=str_remove_all(text,rem_punct),
                  text=str_remove_all(text,rem_dig))
```


```{r}
mdata %>% group_by(gcode) %>% count(sort = T)# %>% slice(1:8) %>% ungroup() # %>% ggplot(aes(x = gcode, y = n)) + geom_col()

mdata %<>% filter(gcode %in% c("401","403","404", "407", "405","413","414","417","416","999"))
# mdata %<>% filter(!gcode %in% c("409","410","411"))
# mdata %<>% drop_na()
mdata %<>% mutate(gcode=as.factor(gcode))

#downsampling

mdata %>% filter(gcode %in% c("999"))

no_label_index <- which(mdata$gcode==999)
# labeled_index <- which(!mdata$gcode==999)
random_indexes <- sample(1:length(no_label_index), length(no_label_index)-200, replace=F)
# random_downsample <- no_label_index[-c(random_indexes)]

mdata <-mdata[-c(no_label_index[c(random_indexes)]),]
```


```{r}
write.csv(mdata, "mdata_2.csv")
```


```{r}
tidy_split <- initial_split(mdata, strata = gcode, prop = 0.7)
train_data <- training(tidy_split)
test_data <- testing(tidy_split)
tidy_split
```


```{r}
train_data <- recipe(gcode~., data = train_data) %>% themis::step_upsample(gcode) %>% prep() %>% juice()


train_data %<>% mutate(text=as.character(text))
```


```{r}
data_res <- train_data %>% vfold_cv(strata = gcode, v = 10, repeats = 3)
data_res <- vfold_cv(train_data)
```


```{r}
tf_idf_rec <- recipe(gcode ~ ., data = train_data) %>%
  step_tokenize(text) %>%
  step_stem(text) %>%
  step_stopwords(text) %>%
  step_tokenfilter(text, max_tokens = 1000) %>%
  step_tfidf(all_predictors())

tf_idf_data <- tf_idf_rec %>% prep() %>% juice()
```


```{r}
hash_rec <- recipe(gcode~., data = train_data) %>%
  step_tokenize(text) %>%
  step_stem(text) %>%
  step_stopwords(text) %>%
  step_tokenfilter(text, max_tokens = 1000) %>%
  step_texthash(text, num_terms = 100)

hash_rec %>% prep() %>% juice()

```


```{r}
model_lg <- multinom_reg() %>%
  set_args(penalty=tune(), mixture=NULL) %>% 
  set_engine("glmnet") %>%
  set_mode("classification")

model_rf <- rand_forest() %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("classification")

```



```{r}
logistic_grid <- grid_regular(parameters(model_lg), levels = 3)
model_control <- control_grid(save_pred = TRUE)
model_metrics <- metric_set(accuracy, roc_auc)
```


```{r}
linear_tf_res <- tune_grid(model_lg, tf_idf_rec, grid = logistic_grid, control = model_control, metrics = model_metrics, resamples = data_res)
linear_hash_res <- tune_grid(model_lg, hash_rec, grid = logistic_grid, control = model_control, metrics = model_metrics, resamples = data_res)

```



```{r}
workflow_general_tf <- workflow() %>% add_recipe(tf_idf_rec)
workflow_lg_tf <- workflow_general_tf %>% add_model(model_lg)
workflow_rf_tf <- workflow_general_tf %>% add_model(model_rf)

workflow_general_hash <- workflow() %>% add_recipe(hash_rec)
workflow_lg_hash <- workflow_general_hash %>% add_model(model_lg)
workflow_rf_hash <- workflow_general_hash %>% add_model(model_rf)
```


```{r}
linear_tf_res %>% autoplot()
best_param_linear_tf_res <- linear_tf_res %>% select_best(metric = 'accuracy')
best_param_linear_tf_res
workflow_final_lg_tf <- workflow_lg_tf %>%
  finalize_workflow(parameters = best_param_linear_tf_res)

log_res_tf <- workflow_final_lg_tf %>%
  fit_resamples(resamples = data_res,
                metrics = metric_set(recall, precision, f_meas, accuracy, kap, roc_auc, sens), control = control_resamples(save_pred = TRUE))

log_res_tf %>% collect_metrics(summarize = TRUE)
```


```{r}
linear_hash_res %>% autoplot()
best_param_linear_hash_res <- linear_hash_res %>% select_best(metric = 'accuracy')
best_param_linear_hash_res

workflow_final_lg_hash <- workflow_lg_hash %>%
  finalize_workflow(parameters = best_param_linear_hash_res)

log_res_hash <- workflow_final_lg_hash %>%
  fit_resamples( resamples = data_res,
                 metrics = metric_set(recall, precision, f_meas, accuracy, kap, roc_auc, sens), control = control_resamples(save_pred = TRUE))

log_res_hash %>% collect_metrics(summarize = TRUE)
```

```{r}

rf_res_hash <- workflow_rf_hash %>%
  fit_resamples(resamples = data_res,
                metrics = metric_set(recall, precision, f_meas, accuracy, kap, roc_auc, sens),
                control = control_resamples( save_pred = TRUE))

rf_res_hash %>% collect_metrics(summarize = TRUE)

```


```{r}
rf_res_tf <- workflow_rf_tf %>%
  fit_resamples(resamples = data_res,
                metrics = metric_set(recall, precision, f_meas, accuracy, kap, roc_auc, sens),
                control = control_resamples( save_pred = TRUE))
rf_res_tf %>% collect_metrics(summarize = TRUE)

```
```{r}
log_metrics_tf <- log_res_tf %>% collect_metrics(summarise = TRUE) %>% mutate(model = "Logistic Regression TF-idf")
log_metrics_hash <- log_res_hash %>% collect_metrics(summarise = TRUE) %>% mutate(model = "Logistic Regression Hash")
rf_metrics_tf <- rf_res_tf %>% collect_metrics(summarise = TRUE) %>% mutate(model = "Random Forest TF-idf")
rf_metrics_hash <- rf_res_hash %>% collect_metrics(summarise = TRUE) %>% mutate(model = "Random Forest Hash")


model_compare <- bind_rows(
  log_metrics_tf, 
  log_metrics_hash, 
  rf_metrics_tf, 
  rf_metrics_hash)



rf_stat1 <- rf_res_tf %>% collect_metrics(summarize = TRUE)
rf_stat1
```


```{r}
model_comp <- model_compare %>%
  select(model, .metric, mean, std_err) %>%
  pivot_wider(names_from = .metric, values_from = c(mean, std_err))


model_comp %>%
  arrange(mean_f_meas) %>%
    mutate(model = fct_reorder(model, mean_f_meas)) %>%
      ggplot(aes(model, mean_f_meas, fill=model)) +
      geom_col() + coord_flip() +
      scale_fill_brewer(palette = "YlGn") +
      geom_text(size = 3, aes(label = round(mean_f_meas, 2), y = mean_f_meas + 0.08), vjust = 1)

```

```{r}
rf_pred_tf <- rf_res_tf %>%
  collect_predictions()

rf_pred_tf %>% conf_mat(gcode, .pred_class)

rf_pred_tf %>% conf_mat(gcode, .pred_class) %>% autoplot(type = "heatmap")
```

```{r}

last_fit_rf <- last_fit(workflow_rf_tf,
                        split = tidy_split,
                        metrics = metric_set(recall, precision, f_meas, accuracy, kap, roc_auc, sens))

rf_stat2 <- last_fit_rf %>% collect_metrics()
rf_stat2

last_fit_rf %>% collect_predictions() %>% conf_mat(gcode, .pred_class) %>% autoplot(type = "heatmap")

myt <- rf_stat1 %>% left_join(rf_stat2, by=".metric") %>% select(.metric, mean, std_err, .estimate)
myt

```

```{r}
library(stargazer)
library(xtable)
#stargazer(myt, type = "latex", out = "models.tex")
stargazer(myt, type = "text")
print(xtable(myt, type = "latex"), file = "filename2.tex")

```


